{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "de6537c4",
   "metadata": {},
   "source": [
    "# Faithfulness Evaluator\n",
    "\n",
    "This notebook uses the `FaithfulnessEvaluator` module to measure if the response from a query engine matches any source nodes.  \n",
    "This is useful for measuring if the response was hallucinated.  \n",
    "The data is extracted from the [New York City](https://en.wikipedia.org/wiki/New_York_City) wikipedia page."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c45de6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index-llms-openai pandas[jinja2] spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a8304f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# attach to the same event-loop\n",
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "190a6684",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import (\n",
    "    VectorStoreIndex,\n",
    "    SimpleDirectoryReader,\n",
    "    Response,\n",
    ")\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.core.evaluation import FaithfulnessEvaluator\n",
    "from llama_index.core.node_parser import SentenceSplitter\n",
    "import pandas as pd\n",
    "\n",
    "pd.set_option(\"display.max_colwidth\", 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efe66f2a",
   "metadata": {},
   "source": [
    "Using GPT-4 here for evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9b98f89-d5b8-4d29-92f6-ad76d5060e9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpt-4\n",
    "gpt4 = OpenAI(temperature=0, model=\"gpt-4\")\n",
    "\n",
    "evaluator_gpt4 = FaithfulnessEvaluator(llm=gpt4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = SimpleDirectoryReader(\"./test_wiki_data/\").load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41f0e53f-77a6-40d5-94ae-3f81b01af75c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create vector index\n",
    "splitter = SentenceSplitter(chunk_size=512)\n",
    "vector_index = VectorStoreIndex.from_documents(\n",
    "    documents, transformations=[splitter]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af730b2e-6949-4865-b7af-bb2bc60a9173",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core.evaluation import EvaluationResult\n",
    "\n",
    "\n",
    "# define jupyter display function\n",
    "def display_eval_df(response: Response, eval_result: EvaluationResult) -> None:\n",
    "    if response.source_nodes == []:\n",
    "        print(\"no response!\")\n",
    "        return\n",
    "    eval_df = pd.DataFrame(\n",
    "        {\n",
    "            \"Response\": str(response),\n",
    "            \"Source\": response.source_nodes[0].node.text[:1000] + \"...\",\n",
    "            \"Evaluation Result\": \"Pass\" if eval_result.passing else \"Fail\",\n",
    "            \"Reasoning\": eval_result.feedback,\n",
    "        },\n",
    "        index=[0],\n",
    "    )\n",
    "    eval_df = eval_df.style.set_properties(\n",
    "        **{\n",
    "            \"inline-size\": \"600px\",\n",
    "            \"overflow-wrap\": \"break-word\",\n",
    "        },\n",
    "        subset=[\"Response\", \"Source\"]\n",
    "    )\n",
    "    display(eval_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "400f486d",
   "metadata": {},
   "source": [
    "To run evaluations you can call the `.evaluate_response()` function on the `Response` object return from the query to run the evaluations. Lets evaluate the outputs of the vector_index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "180a5d2e-9286-477b-9cd0-a5976d18d845",
   "metadata": {},
   "outputs": [],
   "source": [
    "query_engine = vector_index.as_query_engine()\n",
    "response_vector = query_engine.query(\"How did New York City get its name?\")\n",
    "eval_result = evaluator_gpt4.evaluate_response(response=response_vector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c764b8b3-69b1-4ac8-b88b-3f9e204b8bfb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_21e1a_row0_col0, #T_21e1a_row0_col1 {\n",
       "  inline-size: 600px;\n",
       "  overflow-wrap: break-word;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_21e1a\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_21e1a_level0_col0\" class=\"col_heading level0 col0\" >Response</th>\n",
       "      <th id=\"T_21e1a_level0_col1\" class=\"col_heading level0 col1\" >Source</th>\n",
       "      <th id=\"T_21e1a_level0_col2\" class=\"col_heading level0 col2\" >Evaluation Result</th>\n",
       "      <th id=\"T_21e1a_level0_col3\" class=\"col_heading level0 col3\" >Reasoning</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_21e1a_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
       "      <td id=\"T_21e1a_row0_col0\" class=\"data row0 col0\" >New York City got its name when it came under British control in 1664. It was renamed New York after King Charles II of England granted the lands to his brother, the Duke of York.</td>\n",
       "      <td id=\"T_21e1a_row0_col1\" class=\"data row0 col1\" >The city came under British control in 1664 and was renamed New York after King Charles II of England granted the lands to his brother, the Duke of York. The city was regained by the Dutch in July 1673 and was renamed New Orange for one year and three months; the city has been continuously named New York since November 1674. New York City was the capital of the United States from 1785 until 1790, and has been the largest U.S. city since 1790. The Statue of Liberty greeted millions of immigrants as they came to the U.S. by ship in the late 19th and early 20th centuries, and is a symbol of the U.S. and its ideals of liberty and peace. In the 21st century, New York City has emerged as a global node of creativity, entrepreneurship, and as a symbol of freedom and cultural diversity. The New York Times has won the most Pulitzer Prizes for journalism and remains the U.S. media's \"newspaper of record\". In 2019, New York City was voted the greatest city in the world in a survey of over 30,000 p...</td>\n",
       "      <td id=\"T_21e1a_row0_col2\" class=\"data row0 col2\" >Pass</td>\n",
       "      <td id=\"T_21e1a_row0_col3\" class=\"data row0 col3\" >YES</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x169353d10>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display_eval_df(response_vector, eval_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50895fe4",
   "metadata": {},
   "source": [
    "## Benchmark on Generated Question\n",
    "\n",
    "Now lets generate a few more questions so that we have more to evaluate with and run a small benchmark."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90a8cd4d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/loganmarkewich/giant_change/llama_index/llama-index-core/llama_index/core/evaluation/dataset_generation.py:212: DeprecationWarning: Call to deprecated class DatasetGenerator. (Deprecated in favor of `RagDatasetGenerator` which should be used instead.)\n",
      "  return cls(\n",
      "/Users/loganmarkewich/giant_change/llama_index/llama-index-core/llama_index/core/evaluation/dataset_generation.py:309: DeprecationWarning: Call to deprecated class QueryResponseDataset. (Deprecated in favor of `LabelledRagDataset` which should be used instead.)\n",
      "  return QueryResponseDataset(queries=queries, responses=responses_dict)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['What is the population of New York City as of 2020?',\n",
       " 'Which city is the second-largest in the United States?',\n",
       " 'How many people live within 250 miles of New York City?',\n",
       " 'What are the five boroughs of New York City?',\n",
       " 'What is the gross metropolitan product of the New York metropolitan area?']"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from llama_index.core.evaluation import DatasetGenerator\n",
    "\n",
    "question_generator = DatasetGenerator.from_documents(documents)\n",
    "eval_questions = question_generator.generate_questions_from_nodes(5)\n",
    "\n",
    "eval_questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "810ee913",
   "metadata": {},
   "outputs": [],
   "source": [
    "import asyncio\n",
    "\n",
    "\n",
    "def evaluate_query_engine(query_engine, questions):\n",
    "    c = [query_engine.aquery(q) for q in questions]\n",
    "    results = asyncio.run(asyncio.gather(*c))\n",
    "    print(\"finished query\")\n",
    "\n",
    "    total_correct = 0\n",
    "    for r in results:\n",
    "        # evaluate with gpt 4\n",
    "        eval_result = (\n",
    "            1 if evaluator_gpt4.evaluate_response(response=r).passing else 0\n",
    "        )\n",
    "        total_correct += eval_result\n",
    "\n",
    "    return total_correct, len(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a7ca4b8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "finished query\n",
      "score: 5/5\n"
     ]
    }
   ],
   "source": [
    "vector_query_engine = vector_index.as_query_engine()\n",
    "correct, total = evaluate_query_engine(vector_query_engine, eval_questions[:5])\n",
    "\n",
    "print(f\"score: {correct}/{total}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
